Salaries Data import and cleaning

Descrition of data cleaning:

  • reading in data

  • selecting and renaming the columns

  • since names are in last, first format, reorder them to be first last

  • make sure all “Jr”s go at the end

  • remove accents using stringi::stri_trans_general

salaries = read_excel(
  "data/MLB-Salaries 2000-24.xlsx", 
  sheet = "2022.xls",
  skip = 1) |>
  select(1:4) |>
  rename(position = "Pos'n",
         salary_2022 = "2022.0",
         service_time_yrs = "MLS",
         name = "Player") %>%
  mutate(name = str_split(name, ","),
         name = map(name, rev),       
         name = map_chr(name, str_c, collapse = " "),
         name = str_trim(name),
         name = str_replace_all(name, "[*#\\.,]", ""),
         junior = str_detect(name, "Jr"),
         name = if_else(junior == TRUE, str_remove(name, " Jr"), name),
         name = if_else(junior == TRUE, str_c(name, " Jr"), name),
         name = stringi::stri_trans_general(name,id = "Latin-ASCII")) |>
  select(-junior) %>%
  mutate(simple_position = str_split_i(position, "-", 1),
         simple_position = fct_relevel(simple_position,
                                       c("c", "1b", "2b", "3b", "ss",
                                         "lf", "cf", "rf", "inf", "of",
                                         "dh", "rhp", "lhp")),
         service_time_floor = floor(service_time_yrs))
## New names:
## • `` -> `...5`

Batting and Pitching data import and cleaning

Batting

Descrition of data cleaning:

  • reading in data and clean names

  • grouping by name and using a count to ensure we only end up with one of each name (slightly complicated due to interleague trades and potential for players with the same name)

  • remove whitespace, from names and use stringi again to remove accents so that the formatting exactly matches the salaries tibble

  • Filter out those with fewer than 100 plate appearances

batting = read_delim("data/2022 MLB Player Stats - Batting.csv", delim = ";",
                     locale = locale(encoding = "latin1")) |>
  janitor::clean_names() %>%
  group_by(name) |>
  mutate(name_count = n(),
         keep_row = case_when(name_count == 1 ~ TRUE,
                              name_count > 1 & tm == "TOT" ~ TRUE,
                              .default = FALSE)) |>
  filter(keep_row == TRUE) %>%
  mutate(name_count = n(),
         keep_row = case_when(name_count == 1 ~ TRUE,
                              name_count > 1 & lg == "MLB" ~ TRUE,
                              .default = FALSE)) |>
  ungroup() |>
  filter(keep_row == TRUE) %>%
  mutate(name = str_split(name, "\\s+"),
         name = map_chr(name, str_c, collapse = " "),
         name = str_trim(name),
         name = str_replace_all(name, "[*#\\.]", ""),
         name = stringi::stri_trans_general(name,id = "Latin-ASCII"))
## Rows: 992 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr  (3): Name, Tm, Lg
## dbl (26): Rk, Age, G, PA, AB, R, H, 2B, 3B, HR, RBI, SB, CS, BB, SO, BA, OBP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
merged_batting <- salaries |>
  mutate(simple_position = str_split_i(position, "-", 1)) |>
  filter(!simple_position %in% c("rhp","lhp")) |>
  inner_join(batting, by = "name") |>
  filter(pa >= 100)

salary_not_in_batting <- anti_join(salaries, batting, by = "name")
batting_not_in_salary <- anti_join(batting, salaries, by = "name")

Pitching

(same data cleaning process as above)

pitching = read_delim("data/2022 MLB Player Stats - Pitching.csv", delim = ";",
                     locale = locale(encoding = "latin1")) |>
  janitor::clean_names() %>%
  group_by(name) |>
  mutate(name_count = n(),
         keep_row = case_when(name_count == 1 ~ TRUE,
                              name_count > 1 & tm == "TOT" ~ TRUE,
                              .default = FALSE)) |>
  filter(keep_row == TRUE) %>%
  mutate(name_count = n(),
         keep_row = case_when(name_count == 1 ~ TRUE,
                              name_count > 1 & lg == "MLB" ~ TRUE,
                              .default = FALSE)) |>
  ungroup() |>
  filter(keep_row == TRUE) %>%
  mutate(name = str_split(name, "\\s+"),
         name = map_chr(name, str_c, collapse = " "),
         name = str_trim(name),
         name = str_replace_all(name, "[*#\\.]", ""),
         name = stringi::stri_trans_general(name,id = "Latin-ASCII"))
## Rows: 1081 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr  (3): Name, Tm, Lg
## dbl (32): Rk, Age, W, L, W-L%, ERA, G, GS, GF, CG, SHO, SV, IP, H, R, ER, HR...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
merged_pitching <- inner_join(salaries, pitching, by = "name") |>
  filter(ip >= 20) |>
  separate(ip, into = c("ip", "ip_dec"), remove = FALSE, convert = TRUE) |>
  mutate(ip_dec_333 = ifelse(is.na(ip_dec), 0, ip_dec * 333)  ,
         ip_total = paste(ip, ip_dec_333, sep = "."),
         ip_total = as.numeric(ip_total)) |>
  select(-ip, -ip_dec, -ip_dec_333) |>
  separate(position, into = c("hand", "pitcher_type")) |>
  mutate(pitcher_type = ifelse(is.na(pitcher_type), "r", pitcher_type)) |>
  filter(hand == "lhp" | hand == "rhp")
## Warning in inner_join(salaries, pitching, by = "name"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 325 of `x` matches multiple rows in `y`.
## ℹ Row 265 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 172 rows [5, 6, 10, 12,
## 14, 22, 24, 27, 30, 32, 33, 34, 36, 38, 40, 41, 42, 43, 45, 47, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 336 rows [24, 31, 34, 40,
## 44, 46, 49, 53, 55, 58, 59, 63, 65, 68, 70, 72, 77, 79, 81, 82, ...].
salary_not_in_pitching <- anti_join(salaries, pitching, by = "name")
pitching_not_in_salary <- anti_join(pitching, salaries, by = "name")

Note that there are a number of names in salaries not found in batting or pitching, and vice versa. Some of this may be genuine missingness (e.g. salaries has fewer rows than pitching and batting combined), but some is also due to alternative spellings of names in the datasets. Possibly could be fixed using other matching methods, but the analysis pipeline will still work from here.

Salaries EDA

Some Basic EDA for salaries on their own

Salaries by Position Boxplots

salaries %>% 
  ggplot(aes(x=simple_position,y=salary_2022)) +
  geom_boxplot()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Salaries by Experience

salaries %>%
  ggplot(aes(x=floor(service_time_yrs),y=salary_2022)) +
  geom_col() 
## Warning: Removed 295 rows containing missing values or values outside the scale range
## (`geom_col()`).

Positions by Experience

salaries %>%
  ggplot(aes(x=floor(service_time_yrs),fill = simple_position)) +
  geom_bar(position = "fill") 

Salary Distribution by Experience

salaries %>%
  ggplot(aes(x=factor(floor(service_time_yrs)), y=salary_2022)) +
  geom_boxplot()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Batting EDA

Distributions of variables

Start with some descriptive statistics/plots

Service time (i.e. number of years of experience)

merged_batting |>
  ggplot(aes(x = service_time_yrs)) + 
  geom_density() +
  labs(x = "Service time")

This is left skewed, so may need to transform it if used in regression

Salary

merged_batting |>
  ggplot(aes(x = salary_2022)) + 
  geom_density() + 
  labs(x = "Salary")
## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_density()`).

This is also heavily left skewed, with many players having salary under $1,000,000

Salary and service time are likely related, since rookies have contract minimums

merged_batting |>
  ggplot(aes(x = service_time_yrs, y = salary_2022)) + 
  geom_point(alpha = 0.5) 
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

Position type

merged_batting |>
  mutate(positions = str_split_i(position, "-",1))  |>
  count(positions) |> 
  arrange(-n) |>
  knitr::kable()
positions n
2b 64
c 59
ss 54
3b 46
1b 40
of 38
cf 33
rf 30
lf 25
dh 7
inf 2

Since we filtered on plate appearances, most players in the batting dataset are position players (i.e., not pitchers). If we use this in regression analysis, we may want to filter out pitchers entirely.

Looking at some hitting statistics

merged_batting |> 
  ggplot(aes(x = hr)) + 
  geom_histogram() +
  labs(x = "Number of home runs (HRs)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

merged_batting |> 
  ggplot(aes(x = rbi)) + 
  geom_histogram() +
  labs(x = "Runs batted in (RBI)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

These are also left skewed so will need to normalize for any linear model.

OPS is an “all in one” statistic combining on-base percentage (OBP) and slugging (SLG).

OBP is calculated as \(\frac{Hits (H) + Walks (BB) + Hit by pitch (HBP)}{At \ bats (AB) + Walks (BB) + sacrifice \ flies (SF) + Hit by pitch (HBP)}\)

Slugging is calculated as \(\frac{total \ bases (TB)}{At \ bats (AB)}\)

OPS is the sum of these two statistics.

OPS+ (or adjusted OPS) is adjusted for the park and league averages.

merged_batting |> 
  ggplot(aes(x = ops)) + 
  geom_histogram() +
  labs(x = "OPS")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

merged_batting |> 
  ggplot(aes(x = ops_2)) + 
  geom_histogram() +
  labs(x = "OPS+")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

One idea might be to compare OPS or OPS+ to more traditional statistics, like RBIs, HRs, or batting averages to OPS or OPS+ as single predictors of salary.

One other factor to consider is the correlation between some of these variables

merged_batting |>
  select(hr, rbi, pa, ba, ops) |>
  GGally::ggpairs()
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Some of these potential predictors are fairly strongly correlated (like RBI and HR, or BA and OPS), so it’s important not to include to many collinear variables in a potential linear model.

Relationship to salary

Looking at some relationships to salary

Position vs. salary

merged_batting |>
  mutate(positions = str_split_i(position, "-",1))  |>
  ggplot(aes(x = positions,y= salary_2022)) +
  geom_boxplot()
## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

merged_batting |>
  mutate(positions = str_split_i(position, "-",1))  |>
  group_by(positions) |>
  summarize(avg_salary = mean(salary_2022, na.rm = TRUE),
            median_salary = median(salary_2022, na.rm = TRUE))
## # A tibble: 11 × 3
##    positions avg_salary median_salary
##    <chr>          <dbl>         <dbl>
##  1 1b          8845755.       6225000
##  2 2b          5010094.       2550000
##  3 3b          8302384.       3375000
##  4 c           3669983.       1750000
##  5 cf          8145270.       5500000
##  6 dh         11857143.      12000000
##  7 inf         1312500        1312500
##  8 lf          6971404.       6125000
##  9 of          2211196.        746100
## 10 rf          7833411.       4875000
## 11 ss          7266837        4000000

HR vs. salary

merged_batting |>
  ggplot(aes(x = hr, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

RBI vs. salary

merged_batting |>
  ggplot(aes(x = rbi, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

BA vs. salary

merged_batting |>
  ggplot(aes(x = ba, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

Plate appearances vs. salary

merged_batting |>
  ggplot(aes(x = pa, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

OPS+ vs salary

merged_batting |>
  ggplot(aes(x = ops_2, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

Regression models

Pitching EDA

In order to eliminate pitchers with very few appearances or position players (catchers, first basemen, etc.) who came in to pitch in blowouts, I filtered only for pitchers with at least 10 innings pitches.

Distribution of Innings Pitched

merged_pitching |> 
  ggplot(aes(x = ip_total)) + 
  geom_histogram(binwidth = 2, fill = "lightblue", col = "black")

Distribution of ERA+

merged_pitching |>
  ggplot(aes(x = era_2)) + 
  geom_histogram(fill = "lightblue", col = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

IP vs. Salary Plot

merged_pitching |> 
  ggplot(aes(x = ip_total, y = salary_2022)) + 
  geom_point() + 
  scale_x_continuous(lim = c(0, 400))
## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).

ERA+ vs. Salary Plot

merged_pitching |> 
  ggplot(aes(x = era_2, y = salary_2022)) + 
  geom_point() +
  scale_x_continuous(lim = c(0, 400))
## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).

IP vs. Log Salary Plot

merged_pitching |> 
  ggplot(aes(x = ip_total, y = log(salary_2022))) + 
  geom_point() + 
  scale_x_continuous(lim = c(0, 400))
## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).

ERA+ vs. Log Salary Plot

merged_pitching |> 
  ggplot(aes(x = era_2, y = log(salary_2022))) + 
  geom_point() + 
  scale_x_continuous(lim = c(0, 400))
## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).

LMs

lm_ip = lm(salary_2022 ~ ip_total, data = merged_pitching)
summary(lm_ip)
## 
## Call:
## lm(formula = salary_2022 ~ ip_total, data = merged_pitching)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -7881584 -2346730 -1339157   542534 36868673 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   453710     505732   0.897     0.37    
## ip_total       41360       5280   7.833    4e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5394000 on 417 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.1283, Adjusted R-squared:  0.1262 
## F-statistic: 61.35 on 1 and 417 DF,  p-value: 4.002e-14
lm_ip_data = tibble(fitted.values(lm_ip), residuals(lm_ip))

names(lm_ip_data) = c("fitted", "resid")

lm_ip_data |>
  ggplot(aes(x = fitted, y = resid)) +
  geom_point()

view(lm_ip_data)
lm_era_2 = lm(salary_2022 ~ era_2, data = merged_pitching)
summary(lm_era_2)
## 
## Call:
## lm(formula = salary_2022 ~ era_2, data = merged_pitching)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4017039 -3088950 -2721275   385077 39372391 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3560272     674783   5.276 2.12e-07 ***
## era_2           2371       5297   0.448    0.655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5776000 on 417 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.0004801,  Adjusted R-squared:  -0.001917 
## F-statistic: 0.2003 on 1 and 417 DF,  p-value: 0.6547
lm_era_2_data = tibble(fitted.values(lm_era_2), residuals(lm_era_2))

names(lm_era_2_data) = c("fitted", "resid")

lm_era_2_data |>
  ggplot(aes(x = fitted, y = resid)) +
  geom_point()

view(lm_era_2_data)

Log-Transformed LMs

lm_ip_log = lm(log(salary_2022) ~ ip_total, data = merged_pitching)
summary(lm_ip_log)
## 
## Call:
## lm(formula = log(salary_2022) ~ ip_total, data = merged_pitching)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8882 -0.7565 -0.4034  0.7288  3.1811 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.776177   0.097656 141.069  < 2e-16 ***
## ip_total     0.008032   0.001020   7.877 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.042 on 417 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.1295, Adjusted R-squared:  0.1274 
## F-statistic: 62.05 on 1 and 417 DF,  p-value: 2.933e-14
lm_ip_log_data = tibble(fitted.values(lm_ip_log), residuals(lm_ip_log))

names(lm_ip_log_data) = c("fitted", "resid")

lm_ip_log_data |>
  ggplot(aes(x = fitted, y = resid)) +
  geom_point()

view(lm_ip_log_data)
lm_era_2_log = lm(log(salary_2022) ~ era_2, data = merged_pitching)
summary(lm_era_2_log)
## 
## Call:
## lm(formula = log(salary_2022) ~ era_2, data = merged_pitching)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9979 -0.9559 -0.5614  0.8333  3.1417 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.441e+01  1.304e-01 110.497   <2e-16 ***
## era_2       1.869e-04  1.024e-03   0.183    0.855    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.116 on 417 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  7.994e-05,  Adjusted R-squared:  -0.002318 
## F-statistic: 0.03334 on 1 and 417 DF,  p-value: 0.8552
lm_era_2_data_log = tibble(fitted.values(lm_era_2_log), residuals(lm_era_2_log))

names(lm_era_2_data_log) = c("fitted", "resid")

lm_era_2_data_log |>
  ggplot(aes(x = fitted, y = resid)) +
  geom_point()

Final Pitching LM

lm_pitching = lm(salary_2022 ~ hand + pitcher_type + ip_total + era_2,
                 data = merged_pitching)

summary(lm_pitching)
## 
## Call:
## lm(formula = salary_2022 ~ hand + pitcher_type + ip_total + era_2, 
##     data = merged_pitching)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -8526308 -1750980  -765623   513642 34843562 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8090576    2079374   3.891 0.000116 ***
## handrhp        -457404     538042  -0.850 0.395747    
## pitcher_typer -7479814    1909520  -3.917 0.000105 ***
## pitcher_types -2238603    1982913  -1.129 0.259576    
## ip_total         16550       5768   2.870 0.004322 ** 
## era_2             4082       4633   0.881 0.378782    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4948000 on 413 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.2737, Adjusted R-squared:  0.2649 
## F-statistic: 31.12 on 5 and 413 DF,  p-value: < 2.2e-16